Cargo Librerias

library(plotly)
library(rgl)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(ppcor)
library(GGally)
library(corrplot)
library(corrplot)
library(utf8)
library(arules) 
library(arulesViz) 

Los Datos de IMDB

load("/home/andresfaral/Dropbox/LABO CERO/pelis")
dim(peliculas.df)
## [1] 242528     13
peliculas.df %>% arrange(-Votes*Year)
## Warning in -Votes * Year: NAs produced by integer overflow

Cantidad de Peliculas por aƱo

resu<-peliculas.df %>% filter(Year<2019) %>% group_by(Year) %>% summarise(Cantidad=n())
ggplot(data = resu, aes(x = Year, y = Cantidad)) +
  geom_point(color = 'red') 

Empecemos por filtrar aquellas peliculas con mas de 200 votos. Luego me fijo cuantas filas y columnas me quedaron

peliculas.df = peliculas.df[which(peliculas.df$Votes > 200),] 
dim(peliculas.df)
## [1] 50488    13

Generos de Peliculas

Ahora veo cuales son los gĆ©neros posibles. Como en el campo Genre puedo tener mas de uno separado por coma, primero creo una funcion que me los separa con este criterio, luego le aplico esa funcion a cada elemento Ćŗnico en el campo genres. Luego ā€œaplastoā€ la lista de listas resultante para tener un array de una sola dimension. Finalmente elimino los lagging spaces y vuelvo a obtener los valores Ćŗnicos.

split_genre <-function(genres){strsplit(as.character(genres), ",")}
genres = sapply(unique(peliculas.df$Genre), split_genre,simplify = "array")
generos = unique(trimws(unlist(genres)))
generos = generos [-22]
for (i in 1:length(generos)) {
  peliculas.df[generos[i]] = as.integer(as.logical(grepl(generos[i], trimws(peliculas.df$Genre))))
}
print(generos)
##  [1] "Adventure" "Comedy"    "War"       "Biography" "Western"   "Romance"  
##  [7] "Animation" "Drama"     "Family"    "Fantasy"   "Action"    "Sci-Fi"   
## [13] "Music"     "Thriller"  "Horror"    "History"   "Mystery"   "Crime"    
## [19] "Sport"     "Musical"   "Adult"     "Game-Show" "News"

Grafico de Barras

cant.gen<-as.data.frame(apply(peliculas.df[,14:36],2,sum))
cant.gen<-cant.gen %>% mutate(Genro=rownames(cant.gen))
names(cant.gen)<-c("Cantidad","Genero")
cant.gen<-cant.gen %>% arrange(Cantidad)
ggplot(data=cant.gen, aes(x=reorder(Genero,Cantidad),y=Cantidad, fill = Genero)) +
geom_bar(stat="identity")+
scale_colour_gradient2()+
coord_flip()+
scale_x_discrete(limits = cant.gen$TGenero)+
theme_classic() + labs(x="Genero")

Relacion entre Generos: Reglas de Asociacion

Vaeamos ahora como se relacionan entre sƭ los gƩneros.

datos0<-peliculas.df[c(generos)]
datos0<-datos0==1
trans <- as(datos0, "transactions")
# Criterios
soporte<-0.05 # 0.2
confianza<-0.3 # 0.4
longitud.min<-2 # 2
longitud.max<-2 # 2
# Busco reglas
reglas <- apriori(trans, 
                      parameter = list(supp=soporte,
                                       conf=confianza,
                                       minlen=longitud.min,
                                       maxlen=longitud.max))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5    0.05      2
##  maxlen target  ext
##       2  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 2524 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 50488 transaction(s)] done [0.01s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2
## Warning in apriori(trans, parameter = list(supp = soporte, conf = confianza, :
## Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!
##  done [0.00s].
## writing ... [6 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspectDT(reglas)
plot(reglas, method="graph",interactive = T,engine = "htmlwidget")
## Warning in plot.rules(reglas, method = "graph", interactive = T, engine =
## "htmlwidget"): The parameter interactive is deprecated. Use engine='interactive'
## instead.

MƔs Reglas

datos0<-peliculas.df[c(generos)]
datos0<-datos0==1
trans <- as(datos0, "transactions")
# Criterios
soporte<-0.01 # 0.2
confianza<-0.3 # 0.4
longitud.min<-2 # 2
longitud.max<-2 # 2
# Busco reglas
reglas <- apriori(trans, 
                      parameter = list(supp=soporte,
                                       conf=confianza,
                                       minlen=longitud.min,
                                       maxlen=longitud.max))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.3    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##       2  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 504 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 50488 transaction(s)] done [0.01s].
## sorting and recoding items ... [19 item(s)] done [0.00s].
## creating transaction tree ... done [0.02s].
## checking subsets of size 1 2
## Warning in apriori(trans, parameter = list(supp = soporte, conf = confianza, :
## Mining stopped (maxlen reached). Only patterns up to a length of 2 returned!
##  done [0.00s].
## writing ... [28 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspectDT(reglas)
plot(reglas, method="graph",interactive = T,engine = "htmlwidget")
## Warning in plot.rules(reglas, method = "graph", interactive = T, engine =
## "htmlwidget"): The parameter interactive is deprecated. Use engine='interactive'
## instead.

Relacion entre Duración y Rating

Dos generos: Drama (ROJO) Horror (CELESTE)

#
subbase<-peliculas.df %>% dplyr::filter(Horror==1|Drama==1)
evol.Dra<-subbase %>% dplyr::filter(Drama==1) %>% group_by(Year) %>% summarise(DurMed=mean(Runtime,na.rm=T),RatMed=mean(Rating,na.rm=T))
evol.Hor<-subbase %>% dplyr::filter(Horror==1) %>% group_by(Year) %>% summarise(DurMed=mean(Runtime,na.rm=T),RatMed=mean(Rating,na.rm=T))
RATING<-subbase$Rating
TIEMPO<-subbase$Year
DURACION<-subbase$Runtime
TITULO<-subbase$Title
#y<-log(subbase$Votes)
tama<-log(subbase$Votes)/10
colores<-ifelse(subbase$Drama==1,"red","cyan")
quegen<-ifelse(subbase$Drama==1,"Drama","Horror")

base.plot<-data.frame(Rating=RATING,Duracion=DURACION,Genero=quegen,Titulo=TITULO)

plot_ly(base.plot, x = ~Rating, y = ~Duracion, color = ~Genero, type = 'scatter', mode = 'markers',
        text = ~Titulo,colors=c("red","cyan"))
## Warning: `arrange_()` was deprecated in dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## Warning: Ignoring 327 observations

GrƔfico Contour

ggplot(base.plot, aes(Rating, Duracion, color = Genero)) +
  geom_point(shape = 16, size = 0.5, show.legend = FALSE,alpha=0.4) +
  theme_minimal() + coord_cartesian(ylim=c(0,200))+ stat_density2d(geom = "polygon",alpha=0.2)
## Warning: Removed 327 rows containing non-finite values (stat_density2d).
## Warning: Removed 327 rows containing missing values (geom_point).

#
ggplot(base.plot, aes(Rating, Duracion, color = Genero)) +  theme_minimal() + coord_cartesian(ylim=c(70,165))+ stat_density2d(geom = "polygon",alpha=0.1) +  scale_alpha_continuous(range = c(0, .1))
## Warning: Removed 327 rows containing non-finite values (stat_density2d).

Relacion entre Año, Duración y Rating Plot 3D

plot3d(RATING,TIEMPO,DURACION,col=colores,zlim=c(60,180),type="p",size=3)
lines3d(evol.Dra$RatMed,evol.Dra$Year,evol.Dra$DurMed,col="black",lwd=5)
lines3d(evol.Hor$RatMed,evol.Hor$Year,evol.Hor$DurMed,col="black",lwd=6)

Evolucion en el tiempo del Rating y la Duracion

get_1genre <-function(genres){ return( trimws(unlist(strsplit(as.character(genres), ","))[1]))}
peliculas.df$Principal_Genre = unlist(sapply(peliculas.df$Genre, get_1genre,simplify = "array"))

peliculas.rec<-peliculas.df %>% dplyr::select(Year,Rating,Runtime,Principal_Genre) %>% na.omit()
calif<-peliculas.rec %>% group_by(Year,Principal_Genre) %>% summarise(Rating=mean(Rating),Duracion=mean(Runtime),Cant=n()) %>% filter(Principal_Genre=="Action"|Principal_Genre=="Drama"|Principal_Genre=="Horror"|Principal_Genre=="Comedy") %>% na.omit()
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
r3 <- plot_ly(calif, x = ~`Duracion`, y = ~`Rating`,
  frame=~Year,size= ~`Cant`,color=~`Principal_Genre`,colors=c("blue","green","red","cyan"),mode="markers",marker = list(symbol = 'circle', sizemode = 'diameter',
                      line = list(width = 2, color = '#FFFFFF'), opacity=0.75)) %>% animation_opts(
    1000, redraw = FALSE
  )


r3
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter